package com.kdtech.analyse.outside;
import com.alibaba.fastjson.JSONObject;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.kdtech.analyse.AnalyseNews;
import com.kdtech.analyse.JSoupUtils;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.crawler.at.UrlArgumentTop;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.RegexUtils;
import com.kdtech.utils.StringUtils;

/**
 * {@link AnalyseNews} implementation for mingpao.com article pages.
 *
 * <p>Two page layouts are handled:
 * <ul>
 *   <li>pages whose URL starts with {@code http://inews.mingpao.com/htm}: title, body and
 *       date are read from the downloaded HTML;</li>
 *   <li>all other pages: the title is read from the HTML, while the body and publication
 *       date are fetched from a companion {@code content_<id>.js} JSON resource whose URL
 *       is derived from the article URL.</li>
 * </ul>
 *
 * @author KK
 */
public class MingpaoAnalyse implements AnalyseNews {

	/** URL patterns that identify an article detail page. */
	private static final String[] DETAIL_URL_PATTERNS={
		"http://.*.mingpao.com/htm/INews/[0-9]{8}/[a-z]{2}[0-9]{5}[a-z].htm",
		"http://.*.mingpao.com/.*/article/[0-9]*/.*/[0-9]{5,}",
	};

	/** URL prefix of the layout whose content is embedded directly in the HTML. */
	private static final String INEWS_HTM_PREFIX="http://inews.mingpao.com/htm";

	/**
	 * Template of the JSON resource holding an article's body and publication date.
	 * Example article URL:
	 * {@code http://news2.mingpao.com/ins/%sfs/web_tc/article/20141014/s00001/1413269402504}
	 * Example resulting resource URL:
	 * {@code http://news2.mingpao.com/dat/ins/ins_web_tc/article1/20140710/content_1404954304200.js?0}
	 */
	private static final String AJAX_URL_TEMPLATE="http://news2.mingpao.com/dat/%s/%s_%s/article1/%s/content_%s.js?0";

	/**
	 * Number of "/"-separated parts an article URL must have so that the parts at
	 * indexes 3, 5, 7 and 9 (used to fill {@link #AJAX_URL_TEMPLATE}) all exist.
	 */
	private static final int MIN_URL_PARTS=10;

	/**
	 * Tells whether the given URL matches one of the known article URL patterns.
	 *
	 * @param url the URL to test
	 * @return the result of {@code RegexUtils.matchAnyIgnoreCase} against the detail-page patterns
	 */
	public boolean isDetailPage(String url) {
		return RegexUtils.matchAnyIgnoreCase(url, DETAIL_URL_PATTERNS);
	}

	/**
	 * Extracts title, content, author and date from a downloaded article page.
	 *
	 * <p>When the page HTML is missing, the returned object carries only the URL.
	 * When no date can be extracted from the page (or its JSON resource), the URL itself
	 * is handed to {@code DateUtils.matchDate} as a fallback.
	 *
	 * @param urlMeta the crawled page; its URL and HTML are read
	 * @return a {@link NewsMeta} populated with whatever fields could be extracted
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		NewsMeta news=new NewsMeta();
		String url=urlMeta.getUrl();
		news.setUrl(url);

		String htmltxt=urlMeta.getHtml();
		if (htmltxt == null) {
			// Nothing to parse: Jsoup.parse rejects null input, so stop here.
			return news;
		}

		Document doc=Jsoup.parse(htmltxt);
		String title=doc.select("title").text();
		String content=null;
		Long date=null;

		if (url.startsWith(INEWS_HTM_PREFIX)){
			content=HtmlCleaner.getContentHtml(url,doc.select(".content_medium"));
			// The date text is assembled from the raw <title>, so build it before trimming the title.
			String dateStr=StringUtils.substringAfterLast(title, "-") + " " + StringUtils.substringBetween(title, "(", ")") ;
			date=DateUtils.matchDate(dateStr);
			if (title.contains("---")) {
				title=StringUtils.substringAfterLast(title, "---");
			}
			if (title.contains("(")) {
				title=StringUtils.substringBefore(title, "(");
			}
		}else{
			if (title.contains("-")){
				title=StringUtils.substringAfterLast(title, "-");
			}
			String[] split=url.split("/");
			// Guard against URLs that are shallower than expected; without enough parts
			// the resource URL cannot be built, so content stays null and the date falls
			// back to the URL below.
			if (split.length >= MIN_URL_PARTS){
				String ajaxUrl=String.format(AJAX_URL_TEMPLATE, split[3],split[3],split[5],split[7],split[9]);
				String getHtml=CrawlHTML.GetHtml(ajaxUrl);
				if (StringUtils.isNotBlank(getHtml)){
					try {
						JSONObject json=JSONObject.parseObject(getHtml);
						content=json.getString("DESCRIPTION");
						String pubDate=json.getString("PUBDATE");
						date=DateUtils.matchDate(pubDate);
					} catch (Exception e) {
						// Best effort: a malformed response must not abort the whole parse.
						e.printStackTrace();
					}
				}
			}
		}

		if (date==null){
			date=DateUtils.matchDate(url);
		}
		String author=JSoupUtils.matchAuthor(doc, "来源：");
		news.setTitle(StringUtils.trimSpace(title));
		news.setContent(content);
		news.setAuthor(author);
		news.setDate(date);

		return news;
	}
}
